Regression Analysis

# Real GDP (Level) > Y

#Probability of Losing a Job >X3
#Job Openings per Unemployed Person >X4 
#Ratio of Hires to Openings > X5
#Business Survey > X6
#Business Survey (Peak Diff) > X7
#Yield Curve    > X8
#Excnomic Policy Index  > X9
#Equity Uncertainty > X10
#Unemployment Rate (YoY) >X11

LOGY <- log(Pull$Y) 
LOGY1 <- log(lag(Pull$Y,1))

AnnGDP <- (LOGY-LOGY1)*12
GDP_LOG <- log(Pull$Y) 

# X1 Functional Form Test

X1Squared <- Pull$X1*Pull$X1
X1Root <- sqrt(Pull$X1)
TestX1 <- lm(AnnGDP~Pull$X1+X1Squared)
TestX1
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X1 + X1Squared)
## 
## Coefficients:
## (Intercept)      Pull$X1    X1Squared  
##  -2.582e-03    5.023e-05   -2.266e-08
stargazer(TestX1, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## ==================================================
##                          Dependent variable:      
##                     ------------------------------
##                                 AnnGDP            
## --------------------------------------------------
## X1                    0.0001** (0.00000, 0.0001)  
## X1Squared           -0.00000*** (-0.00000, -0.000)
## Constant                -0.003 (-0.030, 0.025)    
## --------------------------------------------------
## Observations                     230              
## R2                              0.061             
## Adjusted R2                     0.053             
## Residual Std. Error        0.022 (df = 227)       
## F Statistic             7.354*** (df = 2; 227)    
## ==================================================
## Note:                  *p<0.1; **p<0.05; ***p<0.01
A1 <- (Pull$X1+X1Squared)
summary(TestX1)
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X1 + X1Squared)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.097927 -0.009636  0.002610  0.012044  0.050861 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -2.582e-03  1.414e-02  -0.183  0.85526   
## Pull$X1      5.023e-05  2.315e-05   2.170  0.03106 * 
## X1Squared   -2.266e-08  8.719e-09  -2.599  0.00997 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02201 on 227 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.06085,    Adjusted R-squared:  0.05257 
## F-statistic: 7.354 on 2 and 227 DF,  p-value: 0.0008046
# X2 Functional Form Test

X2Squared <- Pull$X2*Pull$X2
X2Root <- sqrt(Pull$X2)
TestX2 <- lm(AnnGDP~Pull$X2)
TestX2
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X2)
## 
## Coefficients:
## (Intercept)      Pull$X2  
##   3.376e-02   -4.293e-05
stargazer(TestX2, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                               AnnGDP           
## -----------------------------------------------
## X2                  -0.00004 (-0.0001, 0.00005)
## Constant              0.034** (0.004, 0.063)   
## -----------------------------------------------
## Observations                    230            
## R2                             0.004           
## Adjusted R2                   -0.001           
## Residual Std. Error      0.023 (df = 228)      
## F Statistic             0.866 (df = 1; 228)    
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
A2 <- Pull$X2

# X3 Functional Form Test

X3Squared <- Pull$X3*Pull$X3
X3Root <- sqrt(Pull$X3)
TestX3 <- lm(AnnGDP~Pull$X3)
TestX3
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X3)
## 
## Coefficients:
## (Intercept)      Pull$X3  
##   -0.007321     0.143167
stargazer(TestX3, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                               AnnGDP           
## -----------------------------------------------
## X3                     0.143 (-0.060, 0.346)   
## Constant              -0.007 (-0.046, 0.031)   
## -----------------------------------------------
## Observations                    230            
## R2                             0.008           
## Adjusted R2                    0.004           
## Residual Std. Error      0.023 (df = 228)      
## F Statistic             1.907 (df = 1; 228)    
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
A3 <- Pull$X3

# X4 Functional Form Test

X4Squared <- Pull$X4*Pull$X4
X4Root <- sqrt(Pull$X4)
TestX4 <-lm(AnnGDP~Pull$X4+X4Squared)
TestX4
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X4 + X4Squared)
## 
## Coefficients:
## (Intercept)      Pull$X4    X4Squared  
##     0.01160      0.02178     -0.01011
stargazer(TestX4, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                               AnnGDP           
## -----------------------------------------------
## X4                     0.022 (-0.022, 0.065)   
## X4Squared             -0.010 (-0.041, 0.021)   
## Constant              0.012* (-0.001, 0.025)   
## -----------------------------------------------
## Observations                    230            
## R2                             0.013           
## Adjusted R2                    0.004           
## Residual Std. Error      0.023 (df = 227)      
## F Statistic             1.446 (df = 2; 227)    
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
A4=(Pull$X4+X4Squared)

# X5 Functional Form Test

X5Squared <- Pull$X5*Pull$X5
X5Root <- sqrt(Pull$X5)
TestX5 <- lm(AnnGDP~Pull$X5+X5Squared)
TestX5
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X5 + X5Squared)
## 
## Coefficients:
## (Intercept)      Pull$X5    X5Squared  
##     0.09613     -0.12869      0.05224
stargazer(TestX5, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                               AnnGDP           
## -----------------------------------------------
## X5                   -0.129** (-0.253, -0.005) 
## X5Squared             0.052* (-0.0004, 0.105)  
## Constant              0.096*** (0.025, 0.167)  
## -----------------------------------------------
## Observations                    230            
## R2                             0.020           
## Adjusted R2                    0.012           
## Residual Std. Error      0.022 (df = 227)      
## F Statistic            2.355* (df = 2; 227)    
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
A5 <- Pull$X5+X5Squared


# X6 Functional Form Test
X6Squared <- Pull$X6*Pull$X6
X6Root <- sqrt(Pull$X6)
TestX6 <- lm(AnnGDP~Pull$X6+X6Squared)
TestX6
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X6 + X6Squared)
## 
## Coefficients:
## (Intercept)      Pull$X6    X6Squared  
##  -32.322274     0.638887    -0.003154
stargazer(TestX6, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## =================================================
##                          Dependent variable:     
##                     -----------------------------
##                                AnnGDP            
## -------------------------------------------------
## X6                     0.639*** (0.403, 0.874)   
## X6Squared            -0.003*** (-0.004, -0.002)  
## Constant            -32.322*** (-44.027, -20.617)
## -------------------------------------------------
## Observations                     230             
## R2                              0.379            
## Adjusted R2                     0.374            
## Residual Std. Error       0.018 (df = 227)       
## F Statistic            69.400*** (df = 2; 227)   
## =================================================
## Note:                 *p<0.1; **p<0.05; ***p<0.01
A6 <- Pull$X6+X6Squared

# X7 Functional Form Test
X7Squared <- Pull$X7*Pull$X7
X7Root <- sqrt(Pull$X7)
## Warning in sqrt(Pull$X7): NaNs produced
TestX7 <- lm(AnnGDP~Pull$X7+X7Squared)
TestX7
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X7 + X7Squared)
## 
## Coefficients:
## (Intercept)      Pull$X7    X7Squared  
##    0.009741    -1.620990   -33.953005
stargazer(TestX7, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## =================================================
##                          Dependent variable:     
##                     -----------------------------
##                                AnnGDP            
## -------------------------------------------------
## X7                   -1.621*** (-2.705, -0.537)  
## X7Squared           -33.953*** (-46.696, -21.210)
## Constant                0.010 (-0.012, 0.032)    
## -------------------------------------------------
## Observations                     230             
## R2                              0.379            
## Adjusted R2                     0.374            
## Residual Std. Error       0.018 (df = 227)       
## F Statistic            69.400*** (df = 2; 227)   
## =================================================
## Note:                 *p<0.1; **p<0.05; ***p<0.01
A7 <- Pull$X7+X7Squared

# X8 Functional Form Test
X8Squared <- Pull$X8*Pull$X8
X8Root <- sqrt(Pull$X8)
## Warning in sqrt(Pull$X8): NaNs produced
TestX8 <- lm(AnnGDP~Pull$X8)
TestX8
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X8)
## 
## Coefficients:
## (Intercept)      Pull$X8  
##   0.0206399   -0.0004445
stargazer(TestX8, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                               AnnGDP           
## -----------------------------------------------
## X8                    -0.0004 (-0.003, 0.002)  
## Constant              0.021*** (0.015, 0.026)  
## -----------------------------------------------
## Observations                    230            
## R2                            0.0005           
## Adjusted R2                   -0.004           
## Residual Std. Error      0.023 (df = 228)      
## F Statistic             0.112 (df = 1; 228)    
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
A8 <- Pull$X8


# X9 Functional Form Test
X9Squared <- Pull$X9*Pull$X9
X9Root <- sqrt(Pull$X9)
TestX9 <- lm(AnnGDP~Pull$X9+X9Squared)
TestX9
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X9 + X9Squared)
## 
## Coefficients:
## (Intercept)      Pull$X9    X9Squared  
##   2.740e-02    2.865e-05   -8.043e-07
stargazer(TestX9, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## ================================================
##                         Dependent variable:     
##                     ----------------------------
##                                AnnGDP           
## ------------------------------------------------
## X9                   0.00003 (-0.0002, 0.0003)  
## X9Squared           -0.00000 (-0.00000, 0.00000)
## Constant              0.027*** (0.011, 0.044)   
## ------------------------------------------------
## Observations                    230             
## R2                             0.150            
## Adjusted R2                    0.142            
## Residual Std. Error       0.021 (df = 227)      
## F Statistic           19.961*** (df = 2; 227)   
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01
A9 <- Pull$X9+X9Squared

# X10 Functional Form Test
X10Squared<- Pull$X10*Pull$X10
X10Root <- sqrt(Pull$X10)
## Warning in sqrt(Pull$X10): NaNs produced
TestX10 <- lm(AnnGDP~Pull$X10)
TestX10
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X10)
## 
## Coefficients:
## (Intercept)     Pull$X10  
##   2.228e-02   -6.671e-05
stargazer(TestX10, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## ==================================================
##                          Dependent variable:      
##                     ------------------------------
##                                 AnnGDP            
## --------------------------------------------------
## X10                 -0.0001*** (-0.0001, -0.00004)
## Constant               0.022*** (0.019, 0.025)    
## --------------------------------------------------
## Observations                     230              
## R2                              0.134             
## Adjusted R2                     0.130             
## Residual Std. Error        0.021 (df = 228)       
## F Statistic            35.240*** (df = 1; 228)    
## ==================================================
## Note:                  *p<0.1; **p<0.05; ***p<0.01
A10 <- Pull$X10


# X11 Functional Form Test
X11Squared <- Pull$X11*Pull$X11
X11Root <- sqrt(Pull$X11)
## Warning in sqrt(Pull$X11): NaNs produced
TestX11 <- lm(AnnGDP~Pull$X11)
TestX11
## 
## Call:
## lm(formula = AnnGDP ~ Pull$X11)
## 
## Coefficients:
## (Intercept)     Pull$X11  
##   0.0202694   -0.0004122
stargazer(TestX11, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## ================================================
##                         Dependent variable:     
##                     ----------------------------
##                                AnnGDP           
## ------------------------------------------------
## X11                 -0.0004*** (-0.001, -0.0003)
## Constant              0.020*** (0.018, 0.023)   
## ------------------------------------------------
## Observations                    230             
## R2                             0.130            
## Adjusted R2                    0.126            
## Residual Std. Error       0.021 (df = 228)      
## F Statistic           33.987*** (df = 1; 228)   
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01
A11 <- Pull$X11



#First Test of All Best Form Variables

CurrentGDP <- lm(GDP_LOG~ A3+A4+A5+A6+A7+A8+A9+A10+A11)
CurrentGDP
## 
## Call:
## lm(formula = GDP_LOG ~ A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + 
##     A11)
## 
## Coefficients:
## (Intercept)           A3           A4           A5           A6           A7  
##  -5.780e+00    7.050e-01    1.926e-02   -1.235e-01    1.442e-03   -2.978e+01  
##          A8           A9          A10          A11  
##   2.490e-03    1.465e-06   -8.492e-05    1.058e-03
stargazer(CurrentGDP, type="text", title="Baseline Results", single.row=TRUE, 
          ci=TRUE, ci.level=0.95)
## 
## Baseline Results
## =================================================
##                          Dependent variable:     
##                     -----------------------------
##                                GDP_LOG           
## -------------------------------------------------
## A3                      0.705 (-0.171, 1.582)    
## A4                      0.019 (-0.006, 0.044)    
## A5                   -0.123*** (-0.146, -0.101)  
## A6                      0.001 (-0.003, 0.005)    
## A7                   -29.781 (-120.189, 60.627)  
## A8                      0.002 (-0.012, 0.017)    
## A9                  0.00000*** (0.00000, 0.00000)
## A10                 -0.0001** (-0.0002, -0.00001)
## A11                   0.001*** (0.0003, 0.002)   
## Constant              -5.780 (-49.358, 37.798)   
## -------------------------------------------------
## Observations                     231             
## R2                              0.687            
## Adjusted R2                     0.674            
## Residual Std. Error       0.059 (df = 221)       
## F Statistic            53.878*** (df = 9; 221)   
## =================================================
## Note:                 *p<0.1; **p<0.05; ***p<0.01
# Gathering all predictors 

predictors <- cbind(A3,A4,A5,A6,A7,A8,A9,A10,A11,GDP_LOG) %>%
  as.data.frame()

saveRDS(predictors, "./models/predictors.RDS")
# function to automate lag calculation
lagged_predictors <- function(n_lags, predictor_df, n_predictors=10){
  pred_df <- sapply(1:n_predictors, function(x){
    lag(predictor_df[,x],n_lags)
  } )
  pred_df <- as.data.frame(pred_df)
  return(pred_df)
}

# one month lag
predictors_oml <- lagged_predictors(1,predictors)
predictors_oml <- cbind(GDP_LOG, predictors_oml) %>%
  drop_na()

# sampling then testing
testIDs <- sample(1:dim(predictors_oml)[1], 0.8*dim(predictors_oml)[1])
training_predictors_oml <- predictors_oml[testIDs,]
testing_predictors_oml <- predictors_oml[-testIDs,]
gdp_oml_lm <- lm(GDP_LOG ~., data = training_predictors_oml)
summary(gdp_oml_lm)
## 
## Call:
## lm(formula = GDP_LOG ~ ., data = training_predictors_oml)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0074757 -0.0008033  0.0000831  0.0009680  0.0040170 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.275e+00  6.422e-01  -3.542 0.000511 ***
## V1           4.548e-02  1.295e-02   3.511 0.000569 ***
## V2           8.145e-04  3.626e-04   2.246 0.025940 *  
## V3          -2.366e-05  4.154e-04  -0.057 0.954639    
## V4           2.105e-04  5.909e-05   3.562 0.000475 ***
## V5          -4.673e+00  1.335e+00  -3.500 0.000591 ***
## V6          -1.745e-04  2.088e-04  -0.836 0.404482    
## V7           2.490e-08  1.220e-08   2.040 0.042839 *  
## V8          -2.942e-06  1.180e-06  -2.494 0.013576 *  
## V9          -5.252e-06  1.102e-05  -0.477 0.634114    
## V10          9.977e-01  1.947e-03 512.381  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001514 on 173 degrees of freedom
## Multiple R-squared:  0.9998, Adjusted R-squared:  0.9998 
## F-statistic: 8.348e+04 on 10 and 173 DF,  p-value: < 2.2e-16
pred_gdp_log_oml <- predict.lm(gdp_oml_lm, newdata = testing_predictors_oml)

ggplot() + 
  geom_point(aes(x = testing_predictors_oml$GDP_LOG, y = pred_gdp_log_oml)) + 
  geom_abline(intercept = 0, slope = 1) + 
  labs(x = "Actual Logged GDP",
       y = "Predicted Logged GDP")

training_res <- training_predictors_oml$GDP_LOG - predict(gdp_oml_lm, training_predictors_oml)
holdout_res <- testing_predictors_oml$GDP_LOG - predict(gdp_oml_lm, testing_predictors_oml)
# t-test to check whether predicted residuals are different from zero 
t.test(holdout_res, mu = 0)
## 
##  One Sample t-test
## 
## data:  holdout_res
## t = 1.0921, df = 45, p-value = 0.2806
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.0002186426  0.0007366422
## sample estimates:
##    mean of x 
## 0.0002589998
# test whether holdout residuals and training residuals are the same
t.test(training_res, holdout_res)
## 
##  Welch Two Sample t-test
## 
## data:  training_res and holdout_res
## t = -0.99308, df = 65.123, p-value = 0.3243
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.0007798451  0.0002618455
## sample estimates:
##    mean of x    mean of y 
## 5.502845e-15 2.589998e-04

Data Processing for Random Forest

Relevant time scale: Monthly
Datasets: Electricity generation, Rigs count, GDP, OECD Business Tendency Survey, Percentage of Civilian Workforce Unemployed >15 weeks

## pct labor force unemloyed for more than 15 weeks
LT_unemployed <- fredr(series_id = "U1RATE", observation_start = as.Date("1973-01-01"))
## yield curve 10year minue 3month
yield_curve <- fredr(series_id = "T10Y3M", observation_start=as.Date('1947-01-01'))

## business tendenc
business_tendenc <- fredr(series_id = "BSCICP03USM665S", observation_start = as.Date("1973-01-01"))

## finding local peaks where m is the number of points on either side of the peak 
find_peaks <- function (x, m = 3){
  shape <- diff(sign(diff(x, na.pad = FALSE)))
  pks <- sapply(which(shape < 0), FUN = function(i){
    z <- i - m + 1
    z <- ifelse(z > 0, z, 1)
    w <- i + m + 1
    w <- ifelse(w < length(x), w, length(x))
    if(all(x[c(z : i, (i + 2) : w)] <= x[i + 1])) return(i + 1) else return(numeric(0))
  })
  pks <- unlist(pks)
  pks
}

distance_from_peak <- function(current_position, peaks_vec, time_series_value){
  if (current_position > peaks_vec[1]){
  distance <- peaks_vec-current_position
  closest <- min(distance)
  distance_pct <- (time_series_value[current_position] - time_series_value[current_position+closest])/time_series_value[current_position+closest]
  distance_pct <- ifelse(is.null(distance_pct) == T, 0, distance_pct)
  distance_pct
  } else { distance_pct <- 0
  distance_pct}
}

business_local_peaks <- find_peaks(business_tendenc$value)
distance_peak <- sapply(1:dim(business_tendenc)[1], function(x){
  distance_pct <- distance_from_peak(x, business_local_peaks, business_tendenc$value)
  distance_pct
})

business_tendencies <- cbind(business_tendenc, distance_peak)
business_tendencies[is.na(business_tendencies)] <- 0
write.csv(business_tendencies, "./data/business_tendencies.csv")
business_sentiment <- business_tendencies

## Electricity Data
# hourly demand

lower48_hourly_demand <- paste("http://api.eia.gov/series/?series_id=EBA.US48-ALL.D.H&api_key=", eia_api_key, sep = "") %>%
  fromJSON() 
lower48_hourly_demand_list <- lower48_hourly_demand$series %>%
  select(data)%>%
  .[[1]] 
date_time <- lower48_hourly_demand_list[[1]][,1]
demand <- lower48_hourly_demand_list[[1]][,2]
lower48_hourly_demand_df <-  cbind(date_time, demand) %>%
  data.frame(stringsAsFactors = F) %>%
  mutate(date = as_date(ymd(substr(date_time, 1, 8))),
         hour = as.numeric(substr(date_time, 10, 11)))

## all generation
# last updated 4/26
all_electricity_generation_monthly <- eia_series(id = "TOTAL.ELETPUS.M")
all_electricity_generation_monthly_df <- unnest(all_electricity_generation_monthly, cols="data")
all_electricity_generation_monthly_df <- all_electricity_generation_monthly_df %>%
  select(series_id, units, value, date, year, month)

all_electricity_generation_monthly_df <- all_electricity_generation_monthly_df %>%
  select(series_id, value, date, year, month) %>%
  mutate(terawatt_value = value/1000, 
         units = "Terawatthours",
         date = as.yearmon(unique(substr(date, 1,7))))

# hourly generation
lower48_hourly_generation <- eia_series(id = "EBA.US48-ALL.NG.H")
lower48_hourly_generation_df <- unnest(lower48_hourly_generation, cols = "data") %>%
  select(series_id, units, value, date, year, month)

# monthly generation from hourly

lower48_monthly_generation_df <- lower48_hourly_generation_df %>%
  group_by(year, month) %>%
  summarise(value = (sum(value))/1000000,
            units = "Terawatthours",
            date = unique(substr(date, 1,7)))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
lower48_monthly_generation_df$date <- as.yearmon(lower48_monthly_generation_df$date)

# intersecting all states with lower 48 | then calculate prop lower48 to extrapolate retroactively
intersecting_months <- inner_join(lower48_monthly_generation_df, all_electricity_generation_monthly_df, by = "date")
colnames(intersecting_months)[3] <- "lower48_terawatt" 
intersecting_months$proportion <- intersecting_months$lower48_terawatt/intersecting_months$terawatt_value
lower48_prop <- fivenum(intersecting_months$proportion)[3]
all_electricity_generation_monthly_extrapolated_df <- all_electricity_generation_monthly_df %>%
  mutate(lower48_terawatt = terawatt_value*lower48_prop) %>%
  filter(date < "Jul 2015") %>%
  select(year, month, lower48_terawatt, units, date)
colnames(all_electricity_generation_monthly_extrapolated_df)[3] <- "value"
lower48_monthly_generation_extrapolated_df <- bind_rows(all_electricity_generation_monthly_extrapolated_df, lower48_monthly_generation_df)
write.csv(lower48_monthly_generation_extrapolated_df, "./data/monthly_electricity.csv")
electricity <- lower48_monthly_generation_extrapolated_df

## crude oil and nat gas rigs in operation 
total_rigs <- eia_series(id = "TOTAL.OGNRPUS.M")
total_rigs_df <- unnest(total_rigs, cols = "data") %>%
  select(series_id, units, value, date, year, month)
total_rigs_df <- total_rigs_df %>%
  arrange(date) %>%
  mutate(change_first_order = round((value - lag(value, 1))/lag(value,1),4),
         change_second_order = ifelse(lag(change_first_order,1) == 0, 0, (change_first_order - lag(change_first_order, 1))/lag(change_first_order,1)))
write.csv(total_rigs_df, "./data/monthly_rigs_count.csv")
rigs <- total_rigs_df
gdp_complete <- gdp %>%
  mutate(date = as.yearmon(substr((Date), 1, 7)),
         `GDP (Continuously Compounding)` = `GDP (Continuously Compounding)`*100,
         `GDP (YoY % Change)` = `GDP (YoY % Change)`*100, 
         gdp_change_lagged = lag(`GDP (Continuously Compounding)`, 1)) %>%
  filter(date >= "Jan 1973") %>%
  select(date, `GDP (YoY % Change)`, `GDP (Continuously Compounding)`, gdp_change_lagged)
electricity_complete <- electricity %>%
  mutate(date = as.yearmon(date),
         rate_generated_change = (value - lag(value,1))/lag(value,1),
         rate_generated_change_secondorder = (rate_generated_change - lag(rate_generated_change,1))/lag(rate_generated_change,1)) %>%
  select(date, rate_generated_change, rate_generated_change_secondorder) 
rigs_complete <- rigs %>%
  mutate(date = as.yearmon(substr(date, 1, 7))) %>%
  select(date, change_first_order, change_second_order)
colnames(rigs_complete)[2:3] <- c("rigs_change_first_order", "rigs_change_second_order")
business_sentiment <- business_sentiment %>%
  mutate(date = as.yearmon(substr(date, 1, 7)),
         sentiment_change_first_order = (value - lag(value,1))/lag(value,1)) %>%
  select(date, value, distance_peak, sentiment_change_first_order)
colnames(business_sentiment)[2] <- "business_sentiment"
sentiment_period <- rep(0, dim(business_sentiment)[1])
for (i in 2:dim(business_sentiment)[1]){
  sentiment_i = business_sentiment$sentiment_change_first_order[i]
  #print(sentiment_i)
  sentiment_h = business_sentiment$sentiment_change_first_order[i-1]
  #print(sentiment_h)
  sentiment_h[is.na(sentiment_h)] <- 0
  if(sign(sentiment_i) == sign(sentiment_h)){
    sentiment_period[i] = sentiment_period[i-1]+1
  }
  else{
    sentiment_period[i] = 1
  }
}
business_sentiment$sentiment_period <- sentiment_period

LT_unemployed_complete <- LT_unemployed %>%
  mutate(date = as.yearmon(substr(date, 1, 7)),
         rate_change_first_order = (value - lag(value,1))/lag(value,1)) %>%
  select(date, value, rate_change_first_order)
colnames(LT_unemployed_complete)[2] <- "LT_unemployment"
rate_change_period <- rep(0, dim(LT_unemployed_complete)[1])
threshold_period <- rep(0, dim(LT_unemployed_complete)[1])
for (i in 2:dim(LT_unemployed_complete)[1]){
  sentiment_i = LT_unemployed_complete$rate_change_first_order[i]
  #print(sentiment_i)
  sentiment_h = LT_unemployed_complete$rate_change_first_order[i-1]
  #print(sentiment_h)
  sentiment_h[is.na(sentiment_h)] <- 0
  if(sign(sentiment_i) == sign(sentiment_h)|sign(sentiment_i)==0){
    rate_change_period[i] = rate_change_period[i-1]+1
  }
  else{
   rate_change_period[i] = 1
  }
  if(LT_unemployed_complete$LT_unemployment[i] <= 1.5){
    threshold_period[i] = threshold_period[i-1]+1
  }
}
LT_unemployed_complete$threshold_period <- threshold_period
LT_unemployed_complete$rate_change_period <- rate_change_period

yield_curve[is.na(yield_curve)] <- 0
yield_curve <- yield_curve %>%
  drop_na() %>%
  mutate(date = as.yearmon(substr(date, 1, 7)),
         value = ifelse(value < 0, -1, 1)) %>%
  group_by(date) %>%
  summarise(value = min(value)) %>%
  mutate(value = as.factor(value))
colnames(yield_curve)[2] <- "curve"

# inner join all together and to get recession dates
econ_situation_nber_df <- inner_join(gdp_complete, electricity_complete, by = "date")
econ_situation_nber_df <- inner_join(econ_situation_nber_df, rigs_complete, by = "date")
econ_situation_nber_df <- inner_join(econ_situation_nber_df, business_sentiment, by = "date")
econ_situation_nber_df <- inner_join(econ_situation_nber_df, LT_unemployed_complete, by = "date")
NBER_recession_months <- econ_situation_nber_df$date %>%
  .[c(11:27, 85:90, 103:119, 211:219, 339:347, 420:438)]
econ_situation_nber_df <- inner_join(econ_situation_nber_df, yield_curve, by = "date")
econ_situation_nber_df <- econ_situation_nber_df %>%
  mutate(NBER_recession = ifelse(date %in% NBER_recession_months, "Y", "N"),
         NBER_recession_next = as.factor(lead(NBER_recession,1)),
         NBER_recession_two = as.factor(lead(NBER_recession,2))
         ) %>%
  as.data.frame() %>%
  drop_na()

DT::datatable(econ_situation_nber_df)

Random Forest Modeling

Two random forest models were trained. One to predict the probability of a recession the next month, and another for the following month. A previously-trained model was loaded to save time when knitting this document.

# creating test-train data
testIDs <- createDataPartition(econ_situation_nber_df$NBER_recession_next, p = 0.7, list = F)
train_x <- econ_situation_nber_df[testIDs,c(4:17)]
train_y <- econ_situation_nber_df$NBER_recession_next[testIDs]
test_x <- econ_situation_nber_df[-testIDs, c(4:17)]
test_y <- econ_situation_nber_df$NBER_recession_next[-testIDs] 

# model tuning 
trctrl <- trainControl(method = "repeatedcv", 
                       number = 10, repeats = 5, search = "grid")
mtry <- ncol(train_x)
ntrees <- 101
tunegrid <- expand.grid(.mtry = c(2:mtry))
metric = "Accuracy"
# following can be commented out if loading a pretrained model from file 
# parallel processing to speed things up
# random forest model 
cl <- makePSOCKcluster(5)
registerDoParallel(cl)
rf_recession <- train(x = train_x, y = train_y, method = "rf", metric = metric, trControl = trctrl, tuneGrid = tunegrid, ntree = ntrees)
stopCluster(cl)
saveRDS(rf_recession, "./models/rf_1mth_all.RDS")
rf_recession <- readRDS("./models/rf_1mth_all.RDS")
predict_y <- predict(rf_recession, newdata = test_x)
predict_y_prob <- predict(rf_recession,newdata = test_x, "prob")
predict_y_prob$date <- econ_situation_nber_df$date[-testIDs]
predict_y_prob$actual <- test_y
predict_y_prob$predicted <- predict_y
cm_1mth_all <- confusionMatrix(predict_y, reference = test_y)
saveRDS(cm_1mth_all, "./models/confusion_matrix_1mth_all.RDS")
cm_1mth_all$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   0.9779411765   0.8842224745   0.9368890738   0.9954276320   0.8970588235 
## AccuracyPValue  McnemarPValue 
##   0.0002902758   1.0000000000
prob_model <- ggplotly(ggplot(predict_y_prob, aes(x = date, y = Y)) +
                         geom_point(aes(color = actual)) + 
                         geom_hline(yintercept = 0.50) + 
                         labs(y = "Probability", x = "Date", colour = "NBER", 
                              title = "Probability of a Recession a Month Ahead", subtitle = "Probabilistic Prediction of a Random Forest Model"))
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the plotly package.
##   Please report the issue at <]8;;https://github.com/plotly/plotly.R/issueshttps://github.com/plotly/plotly.R/issues]8;;>.
saveRDS(prob_model, "./models/ggplot_prob_model.RDS")
prob_model
# creating test-train data
testIDs <- createDataPartition(econ_situation_nber_df$NBER_recession_two, p = 0.7, list = F)
train_x <- econ_situation_nber_df[testIDs,c(4:17)]
train_y <- econ_situation_nber_df$NBER_recession_two[testIDs]
test_x <- econ_situation_nber_df[-testIDs, c(4:17)]
test_y <- econ_situation_nber_df$NBER_recession_two[-testIDs] 

# model tuning 
trctrl <- trainControl(method = "repeatedcv", 
                       number = 10, repeats = 5, search = "grid")
mtry <- ncol(train_x)
ntrees <- 101
tunegrid <- expand.grid(.mtry = c(2:mtry))
metric = "Accuracy"
# following can be commented out if loading a pretrained model from file 
# parallel processing to speed things up
# random forest model 
cl <- makePSOCKcluster(5)
registerDoParallel(cl)
rf_recession2 <- train(x = train_x, y = train_y, method = "rf", metric = metric, trControl = trctrl, tuneGrid = tunegrid, ntree = ntrees)
stopCluster(cl)
saveRDS(rf_recession2, "./models/rf_2mth_all.RDS")
rf_recession2 <- readRDS("./models/rf_2mth_all.RDS")
predict_y <- predict(rf_recession2, newdata = test_x)
predict_y_prob <- predict(rf_recession2,newdata = test_x, "prob")
predict_y_prob$date <- econ_situation_nber_df$date[-testIDs]
predict_y_prob$actual <- test_y
predict_y_prob$predicted <- predict_y
cm_2mth_all <- confusionMatrix(predict_y, reference = test_y)
saveRDS(cm_2mth_all, "./models/confusion_matrix_2mth_all.RDS")
cm_2mth_all$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.9836066            0.7692308            0.9756098 
##       Neg Pred Value            Precision               Recall 
##            0.8333333            0.9756098            0.9836066 
##                   F1           Prevalence       Detection Rate 
##            0.9795918            0.9037037            0.8888889 
## Detection Prevalence    Balanced Accuracy 
##            0.9111111            0.8764187
prob_model_2 <- ggplotly(ggplot(predict_y_prob, aes(x = date, y = Y)) +
                         geom_point(aes(color = actual)) + 
                         geom_hline(yintercept = 0.50) + 
                         labs(y = "Probability", x = "Date", colour = "NBER", 
                              title = "Probability of a Recession 2 Months Ahead", subtitle = "Probabilistic Prediction of a Random Forest Model"))
saveRDS(prob_model_2, "./models/ggplot_prob_model_2.RDS")
prob_model_2
gdp_change_lagged <- -0.655704
test_data_2 <- cbind(gdp_change_lagged, electricity_complete[567,2:3], rigs_complete[567,2:3], business_sentiment[567,2:5], LT_unemployed_complete[567, 2:5], yield_curve[459,2])
predict_y_prob_april20 <- predict(rf_recession,newdata = test_data_2, "prob")
predict_y_prob_may20 <- predict(rf_recession2,newdata = test_data_2, "prob")
econ_state <- c("Expansion", "Contraction")
predict_y_prob_april20
##             N         Y
## 567 0.7227723 0.2772277
predict_y_prob_may20
##             N         Y
## 567 0.8811881 0.1188119
forecast_df2 <- cbind(econ_state, t(predict_y_prob_april20), t(predict_y_prob_may20)) %>%
  data.frame()
colnames(forecast_df2) <- c("Econ. State", "April 2020", "May 2020")
kable(forecast_df2, caption = "R.F. Predictions based on March 2020") %>%
  kable_styling(bootstrap_options = c("striped","hover", "condensed"))
R.F. Predictions based on March 2020
Econ. State April 2020 May 2020
N Expansion 0.722772277227723 0.881188118811881
Y Contraction 0.277227722772277 0.118811881188119

Visualizing Random Forest

The following selects a decision tree from the random forest model and plots it. tree_func adopted wholesale from here

# 

tree_func <- function(final_model, 
                      tree_num) {
  
  # get tree by index
  tree <- randomForest::getTree(final_model, 
                                k = tree_num, 
                                labelVar = TRUE) %>%
    tibble::rownames_to_column() %>%
    # make leaf split points to NA, so the 0s won't get plotted
    mutate(`split point` = ifelse(is.na(prediction), `split point`, NA))
  
  # prepare data frame for graph
  graph_frame <- data.frame(from = rep(tree$rowname, 2),
                            to = c(tree$`left daughter`, tree$`right daughter`))
  
  # convert to graph and delete the last node that we don't want to plot
  graph <- graph_from_data_frame(graph_frame) %>%
    delete_vertices("0")
  
  # set node labels
  V(graph)$node_label <- gsub("_", " ", as.character(tree$`split var`))
  V(graph)$leaf_label <- as.character(tree$prediction)
  V(graph)$split <- as.character(round(tree$`split point`, digits = 2))
  
  # plot
  plot <- ggraph(graph, 'dendrogram') + 
    theme_bw() +
    geom_edge_link() +
    geom_node_point() +
    geom_node_text(aes(label = node_label), na.rm = TRUE, repel = TRUE) +
    geom_node_label(aes(label = split), vjust = 2.5, na.rm = TRUE, fill = "white") +
    geom_node_label(aes(label = leaf_label, fill = leaf_label), na.rm = TRUE, 
                    repel = TRUE, colour = "white", fontface = "bold", show.legend = FALSE) +
    theme(panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          panel.background = element_blank(),
          plot.background = element_rect(fill = "white"),
          panel.border = element_blank(),
          axis.line = element_blank(),
          axis.text.x = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks = element_blank(),
          axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          plot.title = element_text(size = 18))
  
  print(plot)
}

tree_num <- which(rf_recession$finalModel$forest$ndbigtree == min(rf_recession$finalModel$forest$ndbigtree))
tree_plot <- tree_func(final_model = rf_recession$finalModel, tree_num[1])

saveRDS(tree_plot, "./models/tree_plot.RDS")